#### 19th Presidential Election Replication #### 0811 SJ

library(tidyverse)
library(readstata13)
library(stargazer)
library(dplyr)
library(ggplot2)
library(dplyr)
library(readxl)
library(forcats)

setwd("C:/Sinjae Kang/study hard/작업 완료 논문들/(2025.04)대선토론(with 지영, 김정현 교수님)/Harvard Daterverse")
data19 <- read_xlsx("addExcel_2553.xlsx")
str(data19)

## TV Debate
table(data19$Q42)
data19 <- data19 %>%
  mutate(num.watch = case_when(
    Q42 == 1 ~ 3,  # 5–6 times or more
    Q42 == 2 ~ 2,  # 3–4 times
    Q42 == 3 ~ 1,  # 1–2 times
    Q42 == 4 ~ 0   # Did not watch at all
  ))

psych::describe(data19$num.watch)

## Policy and Pledge-Based Voting
table(data19$Q6)
data19 <- data19 %>%
  mutate(policy.based = case_when(
    Q6 == 1 ~ 1,  # Policy and pledges
    Q6 %in% c(2, 3, 4, 5, 6, 7) ~ 0,  # Other responses
    Q6 %in% c(98, 9999) ~ NA_real_  # None or missing → treated as NA
  ))

table(data19$policy.based, useNA = "always")

## info.source - Election-related information sources
library(tidyverse)

#### FINAL INFO GROUP? ####
# 1) Specify Q12A1~Q12A15 + 9999 → NA
q12_vars <- paste0("Q12A", 1:15)
data19 <- data19 %>%
  mutate(across(all_of(q12_vars), ~ na_if(as.integer(.), 9999))) %>%
  # 2) Count selections of 1/3/others by row
  mutate(
    n1      = rowSums(across(all_of(q12_vars), ~ . == 1), na.rm = TRUE),
    n3      = rowSums(across(all_of(q12_vars), ~ . == 3), na.rm = TRUE),
    n_sel   = rowSums(across(all_of(q12_vars), ~ !is.na(.))),  # Total selections
    n_other = n_sel - n1 - n3                                  # Count excluding 1/3
  ) %>%
  # 3) Final 4 categories (order matters!)
  mutate(
    info_group_final = case_when(
      n1 >= 1 & n3 >= 1                ~ "TV_and_Portal_plus_other",   # 1+3 (regardless of other options)
      n1 >= 1 & n3 == 0                ~ "TV_plus_other_no3",         # Only 1 or 1+others (excluding 3)
      n1 == 0 & n3 >= 1                ~ "Portal_plus_other_no1",      # Only 3 or 3+others (excluding 1)
      TRUE                             ~ "Others"                      # All others
    ),
    info_group_final = factor(
      info_group_final,
      levels = c("TV_plus_other_no3", "Portal_plus_other_no1",
                 "TV_and_Portal_plus_other", "Others")
    )
  ) %>%
  dplyr::select(-n1, -n3, -n_sel, -n_other)

# Check distribution
table(data19$info_group_final, useNA = "ifany")

# Set reference category as Others
data19 <- data19 %>%
  mutate(info_group_final = fct_relevel(info_group_final, "TV_and_Portal_plus_other"))


## Female
data19 <- data19 %>%
  mutate(female = case_when(
    DQ3 == 1 ~ 0,
    DQ3 == 2 ~ 1,
    DQ3 %in% c(98, 9999) ~ NA_real_
  ))

table(data19$female, useNA = "always")

## Voting status
# Voted: voted=1, did not vote=0
data19 <- data19 %>%
  mutate(voted = case_when(
    Q4 == 1 ~ 1,
    Q4 == 2 ~ 0,
    Q4 %in% c(98, 9999) ~ NA_real_,
    TRUE ~ NA_real_
  ))

table(data19$voted, useNA = "always")

## Age
data19 <- data19 %>%
  mutate(AGE_T1 = ifelse(SQ12 %in% c(98, 9999), NA,
                         2017 - as.numeric(SQ12)))
summary(data19$AGE_T1)

## Education, 4-year college or higher
data19 <- data19 %>%
  mutate(College = case_when(
    SQ1 %in% c(4, 5, 6)       ~ 1,
    SQ1 %in% c(1, 2, 3) ~ 0,
    SQ1 %in% c(98, 9999)   ~ NA_real_,
    TRUE                   ~ NA_real_
  ))

# Check
table(data19$College, useNA = "always")

## Income
# Income (ordinal numeric: higher = higher income)
data19 <- data19 %>%
  mutate(Income_raw = as.numeric(SQ6),
         Income = ifelse(Income_raw %in% c(98, 9999), NA, Income_raw)) %>%
  dplyr::select(-Income_raw)

summary(data19$Income)

## Ideological orientation (higher value = more conservative)
data19 <- data19 %>%
  mutate(VoterIdeo = ifelse(Q54A6 %in% c(98, 9999), NA, as.numeric(Q54A6)))
summary(data19$VoterIdeo)

## Government trust (higher = more trust)
data19 <- data19 %>%
  mutate(gov.trust = case_when(
    Q22A3 %in% 1:4 ~ as.numeric(Q22A3),   # 1=dissatisfied … 4=very satisfied
    Q22A3 == 9 ~ NA_real_,
    TRUE ~ NA_real_
  ))
table(data19$gov.trust, useNA = "ifany")
table(data19$Q22A3)

## Conservative candidate vote (Hong Joon-pyo, corresponding to Yoon in 20th election)
# Conservative candidate vote dummy (corresponding to Yoon): 1 if Hong Joon-pyo (=2), 0 otherwise
data19 <- data19 %>%
  mutate(Hong = case_when(
    Q7 == 2 ~ 1,                       # Hong Joon-pyo
    Q7 %in% c(1,3,4,5,6) ~ 0,          # Moon/Ahn/Yoo/Sim/Others
    Q7 %in% c(98, 9999) ~ NA_real_,
    TRUE ~ NA_real_
  ))

table(data19$Hong, useNA = "always")

## Democratic satisfaction (proxy)
# Safe handling of missing values + reverse coding (1→4, 2→3, 3→2, 4→1)
data19 <- data19 %>%
  mutate(
    demo_satisfaction = case_when(
      Q50A14 %in% 1:4 ~ 5 - as.numeric(Q50A14),   # Higher value = higher support/satisfaction with democracy
      Q50A14 %in% c(8, 9, 98, 9999) ~ NA_real_,
      TRUE ~ NA_real_
    )
  )

# Check
table(data19$demo_satisfaction, useNA = "always")

## Perception of policy competition intensity
# Policy competition perception (higher = stronger competition), based on Q21C
# If variable name differs, replace Q21C with the appropriate variable name
data19 <- data19 %>%
  mutate(policy_competition = case_when(
    Q21C %in% 1:4 ~ 5 - as.numeric(Q21C),   # 1→4, 2→3, 3→2, 4→1
    Q21C %in% c(8, 9, 98, 9999) ~ NA_real_,
    TRUE ~ NA_real_
  ))

# Check
table(data19$policy_competition, useNA = "always")

#### Analysis ####
data19 <- data19 %>%
  mutate(
    Edu  = College,   # 4-year college or higher dummy in place of Edu in 20th election code
    Yoon = Hong       # Conservative candidate (Hong Joon-pyo) dummy in place of Yoon in 20th election code
  )

# Create subsamples
hong_data <- data19 %>%
  filter(Hong == 1) %>%                # Hong Joon-pyo (conservative) voters
  mutate(across(where(is.factor), forcats::fct_drop))  # (Optional) Clean unused factor levels

moon_data <- data19 %>%
  filter(Q7 == 1) %>%                  # Moon Jae-in voters
  mutate(across(where(is.factor), forcats::fct_drop))

# Check
nrow(hong_data); nrow(moon_data)

#### Descriptive Statistics (based on regression model sample) ####
library(psych)
library(dplyr)

# =============================================================================
# Extract only samples used in regression analysis (remove missing values)
# =============================================================================
# Select only variables used in regression model and remove missing values
analysis_data_19 <- data19 %>%
  dplyr::select(policy.based, num.watch, info_group_final, AGE_T1, Income, Edu, female,
                VoterIdeo, gov.trust, Hong, demo_satisfaction, policy_competition) %>%
  filter(complete.cases(.))

cat("19th Presidential Election full data N:", nrow(data19), "\n")
cat("19th Presidential Election regression analysis data N:", nrow(analysis_data_19), "\n\n")

# =============================================================================
# Convert info_group_final to dummy variables (include all categories)
# =============================================================================
analysis_data_19 <- analysis_data_19 %>%
  mutate(
    info_tv_portal = ifelse(info_group_final == "TV_and_Portal_plus_other", 1, 0),
    info_tv_based = ifelse(info_group_final == "TV_plus_other_no3", 1, 0),
    info_portal_based = ifelse(info_group_final == "Portal_plus_other_no1", 1, 0),
    info_others = ifelse(info_group_final == "Others", 1, 0)
  )

# =============================================================================
# Descriptive statistics including all variables (in desired order)
# =============================================================================
continuous_vars_19 <- analysis_data_19 %>% 
  dplyr::select(
    policy.based,           # Policy Consideration
    info_tv_portal,         # Info Source (TV & Portal)
    info_tv_based,          # Info Source (TV-based)
    info_portal_based,      # Info Source (Portal-based)
    info_others,            # Info Source (Others)
    num.watch,              # Watching Debate  
    AGE_T1,                 # Age
    Income,                 # Income
    Edu,                    # College
    female,                 # Female
    VoterIdeo,              # Ideology
    gov.trust,              # Government trust
    Hong,                   # Voted for Hong (conservative candidate)
    demo_satisfaction,      # Democratic satisfaction
    policy_competition      # Policy competition
  ) %>%
  psych::describe()

print("=== 19th Presidential Election Descriptive Statistics ===")
print(continuous_vars_19)

# =============================================================================
# Check info_group_final distribution
# =============================================================================
cat("\n=== Info Group Final Distribution ===\n")
table(analysis_data_19$info_group_final, useNA = "ifany")
prop.table(table(analysis_data_19$info_group_final, useNA = "ifany")) * 100

# =============================================================================
# Summary for publication table
# =============================================================================
cat("\n=== Summary for Publication Table ===\n")

# Continuous variable summary (Mean, SD, Min, Max, N)
continuous_summary_19 <- analysis_data_19 %>% 
  dplyr::select(policy.based, info_tv_portal, info_tv_based, info_portal_based, info_others,
                num.watch, AGE_T1, Income, Edu, female, VoterIdeo, gov.trust, Hong, 
                demo_satisfaction, policy_competition) %>%
  summarise_all(list(
    Mean = ~mean(.x, na.rm = TRUE),
    SD = ~sd(.x, na.rm = TRUE),
    Min = ~min(.x, na.rm = TRUE),
    Max = ~max(.x, na.rm = TRUE),
    N = ~sum(!is.na(.x))
  )) %>%
  gather(key, value) %>%
  separate(key, into = c("Variable", "Statistic"), sep = "_(?=[^_]+$)", extra = "merge") %>%  # Split by last _
  spread(Statistic, value) %>%
  dplyr::select(Variable, Mean, SD, Min, Max, N)

print(continuous_summary_19)



#### Regression Analysis ####
# Analysis 1. Election information sources - final version
mod1_19_final <- glm(policy.based ~ num.watch + info_group_final +
                       AGE_T1 + Income + Edu + female +
                       VoterIdeo + gov.trust + Hong +
                       demo_satisfaction + policy_competition,
                     data = data19, family = "binomial")
stargazer(mod1_19_final, type = "text")

mod2_19_final <- glm(policy.based ~ num.watch + info_group_final +
                       AGE_T1 + Income + Edu + female +
                       VoterIdeo + gov.trust,
                     data = hong_data, family = "binomial")
stargazer(mod2_19_final, type = "text")

mod3_19_final <- glm(policy.based ~ num.watch + info_group_final +
                       AGE_T1 + Income + Edu + female +
                       VoterIdeo + gov.trust,
                     data = moon_data, family = "binomial")

stargazer(mod3_19_final, type = "text") # Significant negative effect of news among Moon Jae-in supporters!

stargazer(mod1_19_final, mod2_19_final, mod3_19_final,
          type = "html",
          title = "Analysis of Policy-Based Voting by Election Information Source",
          column.labels = c("All", "Hong Supporters", "Moon Supporters"),
          covariate.labels = c("TV Debate Viewing Frequency", 
                               "TV Debates + Others (No Portal)",
                               "Portal + Others (No TV)", 
                               "Others",
                               "Age", "Income", "Education", "Female",
                               "Ideology", "Government Trust", "Hong Support",
                               "Democratic Satisfaction", "Policy Competition", "Constant"),
          dep.var.labels = "Policy-Based Voting",
          star.cutoffs = c(0.05, 0.01, 0.001),
          digits = 3,
          out = "election_info_analysis.html")

getwd()

#### Visualization ####
library(dplyr)
library(forcats)
library(ggplot2)

## 0) Fix reference category as "TV_and_Portal_plus_other"
relevel_info <- function(df) {
  df %>%
    mutate(
      info_group_final = fct_relevel(
        info_group_final,
        "TV_and_Portal_plus_other",   # ← New reference
        "TV_plus_other_no3",
        "Portal_plus_other_no1",
        "Others"
      )
    )
}

data19   <- relevel_info(data19)
if (!exists("hong_data")) hong_data <- data19 %>% filter(Hong == 1)   # Keep if needed
if (!exists("moon_data")) moon_data <- data19 %>% filter(Q7 == 1)     # Keep if needed
hong_data <- relevel_info(hong_data)
moon_data <- relevel_info(moon_data)

## 1) Refit models (recommended as coefficient interpretation changes with reference category change)
mod1_19_final <- glm(
  policy.based ~ num.watch + info_group_final +
    AGE_T1 + Income + Edu + female +
    VoterIdeo + gov.trust + Hong +
    demo_satisfaction + policy_competition,
  data = data19, family = "binomial"
)

mod2_19_final <- glm(
  policy.based ~ num.watch + info_group_final +
    AGE_T1 + Income + Edu + female +
    VoterIdeo + gov.trust,
  data = hong_data, family = "binomial"
)

mod3_19_final <- glm(
  policy.based ~ num.watch + info_group_final +
    AGE_T1 + Income + Edu + female +
    VoterIdeo + gov.trust,
  data = moon_data, family = "binomial"
)

## 2) Function for plotting (also change category order so new reference appears first)
make_pred_plot <- function(model, data, title = "", filename = NULL,
                           order = c("TV_and_Portal_plus_other",
                                     "TV_plus_other_no3",
                                     "Portal_plus_other_no1",
                                     "Others")) {
  
  present_lvls <- model$xlevels$info_group_final
  if (is.null(present_lvls)) stop("info_group_final is not included as factor in the model.")
  
  tt <- terms(model)
  vars_in_model <- attr(tt, "term.labels")
  base_vals <- list()
  for (v in vars_in_model) {
    if (v == "info_group_final") next
    if (grepl(":", v) || grepl("\\(", v)) next
    if (!v %in% names(data)) next
    base_vals[[v]] <- if (is.numeric(data[[v]])) mean(data[[v]], na.rm = TRUE)
    else data[[v]][[which.max(table(data[[v]]))]]
  }
  base_df <- as.data.frame(base_vals)
  
  cats <- intersect(order, present_lvls)
  pred_df <- base_df[rep(1, length(cats)), , drop = FALSE]
  pred_df$info_group_final <- factor(cats, levels = present_lvls)
  
  pr_link <- predict(model, newdata = pred_df, type = "link", se.fit = TRUE)
  out <- pred_df %>%
    mutate(
      source   = factor(as.character(info_group_final), levels = order),
      fit_link = pr_link$fit,
      se_link  = pr_link$se.fit,
      pred     = plogis(fit_link),
      lower    = plogis(fit_link - 1.96 * se_link),
      upper    = plogis(fit_link + 1.96 * se_link)
    ) %>% arrange(source)
  
  p <- ggplot(out, aes(x = source, y = pred)) +
    geom_point(size = 3, color = "darkblue") +
    geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2, size = 1, color = "darkblue") +
    geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
    # ▼ Replace labels
    scale_x_discrete(labels = c(
      "TV_and_Portal_plus_other" = "TV & Portal",
      "TV_plus_other_no3"        = "TV Debates-based",
      "Portal_plus_other_no1"    = "Portal News-based",
      "Others"                   = "Others"
    )) +
    labs(
      title = title,
      x = "Information Source",
      y = "Predicted Probability of Policy-Based Voting") +
    theme_minimal(base_size = 16) +
    theme(axis.text.x = element_text(angle = 0, hjust = 0.5))
  if (!is.null(filename)) ggsave(filename, p, width = 9, height = 6, dpi = 300)
  p
}

## 3) Generate figures (publication style: point + 95% CI)
p_all  <- make_pred_plot(mod1_19_final, data19,
                         title = "Marginal Effects of Information Sources on Policy-Based Voting",
                         filename = "figure4.png")

p_hong <- make_pred_plot(mod2_19_final, hong_data,
                         title = "Marginal Effects of Information Sources on Policy-Based Voting (Hong)",
                         filename = "fig19_hong_info_groups_refTV&Portal.png")

p_moon <- make_pred_plot(mod3_19_final, moon_data,
                         title = "Marginal Effects of Information Sources on Policy-Based Voting (Moon)",
                         filename = "fig19_moon_info_groups_refTV&Portal.png")

p_all; p_hong; p_moon
